Summary

Exploring associations of miRNA expression levels across tumor types in TCGA with suggested correlates defined by the Immune Response Working Group:

  • Overall Leukocyte fraction
  • Individual Relative CIBERSORT fraction
  • Mutation Load
  • TCR,BCR Diversity
  • Expression of
    • IFNG,IFN-gamma
    • PRF1,Perforin
    • GZMA,Granzyme A
    • PDCD1,PD-1
    • CD274,PD-L1
    • PDCD1LG2,PD-L2
    • IL10,IL-10
    • TGFB1,TGF-beta
    • IDO1,IDO
    • HLA-A

 

Prepare data

Retrieve miRNA data from Synapse

Data are stored on Synapse in the folder syn6171109.

Load miRNA sample data

Sample characteristics are stored in a tab-delimited text file (Synapse ID: syn7222010) and can be loaded with read_tsv().

# load sample data
mirna_sample_file <- mirna_files %>% 
    filter(file.id == "syn7222010") %>% 
    .[["file_path"]]
mirna_sample_df <- read_tsv(mirna_sample_file)
Parsed with column specification:
cols(
  id = col_character(),
  Disease = col_character(),
  Sample_Type = col_integer(),
  Protocol = col_character(),
  Platform = col_character()
)

Sample filtering

Sample Quality Annotations: syn4551248 (“merged_sample_quality_annotations.tsv”)

sample_qual_file <- synGet("syn4551248", downloadLocation = "../data/tcga/")
sample_qual_df <- sample_qual_file %>% 
    getFileLocation() %>% 
    read_tsv()
Parsed with column specification:
cols(
  patient_barcode = col_character(),
  aliquot_barcode = col_character(),
  `cancer type` = col_character(),
  platform = col_character(),
  patient_annotation = col_character(),
  sample_annotation = col_character(),
  aliquot_annotation = col_character(),
  aliquot_annotation_updated = col_character(),
  AWG_excluded_because_of_pathology = col_double(),
  AWG_pathology_exclusion_reason = col_character(),
  Reviewed_by_EPC = col_double(),
  Do_not_use = col_character()
)

remove samples based on Do_not_use=True, and remove cases with AWG_excluded_because_of_pathology=True

# samples to exclude from all datasets
exclude_samples <- sample_qual_df %>% 
    mutate(AWG_excluded_because_of_pathology = parse_logical(AWG_excluded_because_of_pathology),
           Do_not_use = parse_logical(str_to_upper(Do_not_use))) %>% 
    filter(AWG_excluded_because_of_pathology | Do_not_use)

Remove samples from miRNA dataset.

mirna_sample_df <- mirna_sample_df %>% 
    filter(!(id %in% exclude_samples$aliquot_barcode))

 

Load miRNA expression data

miRNA normalized, batch corrected expression values for all samples are stored as a matrix in a CSV file (Synapse ID: syn7201053) and can be loaded with read_csv().

mirna_corr_file <- "../data/intermediate/mirna_correlate_data.feather"
force_read <- FALSE
if (!file.exists(mirna_corr_file) | force_read) {
    # load normalized, batch-corrected expression data
    mirna_norm_file <- mirna_files %>% 
        filter(file.id == "syn7201053") %>% 
        .[["file_path"]]
    mirna_norm_df <- read_csv(mirna_norm_file, progress = FALSE)
    
    mirna_corr_df <- mirna_norm_df %>% 
        select(one_of(c("Genes", mirna_sample_pt_df$id)))
    
    write_feather(mirna_corr_df, mirna_corr_file)
    rm(mirna_norm_df)
} else {
    mirna_corr_df <- read_feather(mirna_corr_file)
}

 

Set up plotting colors

tcga_colors <- tribble(
    ~Color, ~Disease,
    "#ED2891", "BRCA",
    "#B2509E", "GBM",
    "#D49DC7", "LGG",
    "#C1A72F", "ACC",
    "#E8C51D", "PCPG",
    "#F9ED32", "THCA",
    "#104A7F", "CHOL",
    "#9EDDF9", "COAD",
    "#007EB5", "ESCA",
    "#CACCDB", "LIHC",
    "#6E7BA2", "PAAD",
    "#DAF1FC", "READ",
    "#00AEEF", "STAD",
    "#F6B667", "CESC",
    "#D97D25", "OV",
    "#FBE3C7", "UCEC",
    "#F89420", "UCS",
    "#97D1A9", "HNSC",
    "#009444", "UVM",
    "#754C29", "LAML",
    "#CEAC8F", "THYM",
    "#3953A4", "DLBC",
    "#BBD642", "SKCM",
    "#00A99D", "SARC",
    "#D3C3E0", "LUAD",
    "#A084BD", "LUSC",
    "#542C88", "MESO",
    "#FAD2D9", "BLCA",
    "#ED1C24", "KICH",
    "#F8AFB3", "KIRC",
    "#EA7075", "KIRP",
    "#7E1918", "PRAD",
    "#BE1E2D", "TGCT"
)
mirna_sample_df <- mirna_sample_df %>% 
    mutate(Disease = factor(Disease, levels = tcga_colors$Disease))

 

Explore miRNA data

Sample characteristics

The table mirna_sample_df contains 5 columns describing the 10543 samples in the data. The Sample_Type column corresponds to TCGA sample type codes, which are defined here. The following sample types are included in the miRNA data:

mirna_sample_df %>% 
    group_by(Sample_Type) %>% 
    tally()

 

Based on the codes, this is the distribution of sample types:

mirna_sample_df %>% 
    group_by(Sample_Type) %>% 
    tally() %>% 
    mutate(Definition = case_when(
        .$Sample_Type == 1 ~ "Primary Solid Tumor",
        .$Sample_Type == 2 ~ "Recurrent Solid Tumorr",
        .$Sample_Type == 3 ~ "Primary Blood Derived Cancer - Peripheral Blood",
        .$Sample_Type == 5 ~ "Additional - New Primary",
        .$Sample_Type == 6 ~ "Metastatic",
        .$Sample_Type == 7 ~ "Additional Metastatic",
        .$Sample_Type == 11 ~ "Solid Tissue Normal"
    ))

Note: only include “primary” tumor samples for now; also, add additional column to store vial ID (to ease mapping between data sets).

mirna_sample_pt_df <- mirna_sample_df %>% 
    filter(Sample_Type %in% c(1, 3, 5)) %>% 
    mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", ""))

 

Additionally, the following disease types are included:

mirna_sample_pt_df %>% 
    group_by(Disease) %>% 
    tally()

Note: exclude LAML, THYM, and DLBC from cell content correlations

 

And technical variables:

mirna_sample_pt_df %>% 
    group_by(Protocol, Platform) %>% 
    tally()

 

miRNA expression values

Expression values in the data are reportedly reads per million (RPM). I’ve randomly selected a few samples from each disease type, sample type, protocol, and platform to inspect the distribution of expression values across all 743 miRNA genes.

set.seed(0)
# randomly select 1-3 samples from each combination of characteristics
sample_sub_df <- mirna_sample_pt_df %>% 
    group_by(Disease, Sample_Type, Protocol, Platform) %>% 
    sample_n(3, replace = TRUE) %>% 
    ungroup() %>% 
    distinct()
# subset and melt the expression data
mirna_sub_df <- mirna_corr_df %>% 
    select(one_of(c("Genes", sample_sub_df$id))) %>%
    gather("sample", "expression", -Genes)

 

Even with a shifted log (log10(x + 1)) transformation, expression values look to be more exponentially distributed than normal.

mirna_sub_df %>% 
    left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>% 
    ggplot(aes(x = log10(expression + 1))) +
    stat_density(aes(group = sample), geom = "line", position = "identity", 
                 alpha = 0.2)

 

I can also look at the distribution of log-RPM values with boxplots:

mirna_sub_df %>% 
    left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>% 
    ggplot(aes(x = sample, y = log10(expression + 1))) +
    geom_boxplot(aes(fill = Disease), outlier.size = 0.5) +
    theme(axis.text.x = element_blank()) +
    scale_fill_manual(values = tcga_colors$Color)

# convert expression df to matrix
mirna_corr_mat <- mirna_corr_df %>% 
    select(one_of(c("Genes", mirna_sample_pt_df$id))) %>% 
    column_to_rownames("Genes") %>% 
    as.matrix()

 

PCA

I used the prcomp() function on the transposed expression matrix (samples x genes, after transposing) to compute PCA data, which I can use to look for batch effects among samples.

mirna_corr_pca <- mirna_corr_mat %>% 
    t() %>% 
    prcomp()

(I can use the broom:tidy() function to convert data in the prcomp object into data frames for ggplot.)

pc_df <- mirna_corr_pca %>% 
    tidy("pcs")
mirna_corr_pca_df <- mirna_corr_pca %>% 
    tidy("samples") %>% 
    filter(PC <= 2) %>% 
    left_join(mirna_sample_pt_df, by = c("row" = "id"))
rm(mirna_corr_mat)

The plot below shows samples plotted as points along the first two principle components (PCs). Points are colored by disease type.

pc1_label <- pc_df %>% 
    filter(PC == 1) %>% 
    transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>% 
    flatten_chr()
pc2_label <- pc_df %>% 
    filter(PC == 2) %>% 
    transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>% 
    flatten_chr()
mirna_corr_pca_df %>% 
    spread(PC, value) %>% 
    ggplot(aes(x = `1`, y = `2`)) +
    geom_point(aes(colour = Disease)) +
    xlab(pc1_label) +
    ylab(pc2_label) +
    scale_color_manual(values = tcga_colors$Color)


 

Prepare correlate data

  • Overall Leukocyte fraction
  • Individual Relative CIBERSORT fraction
  • Mutation Load
  • TCR,BCR Diversity
  • Gene Expression

Leukocyte fraction

Retrieve/load data

Cellular Content: syn7994728 syn5808205 (“TCGA_all_leuk_estimate.masked.20170107.tsv”)

# file_data <- synGet("syn5808205", downloadLocation = "./")
leuk_frac_file <- "../data/tcga/TCGA_all_leuk_estimate.masked.20170107.tsv"
leuk_frac_df <- read_tsv(leuk_frac_file, col_names = FALSE) %>% 
    set_names(c("disease", "id", "leuk_frac"))
Parsed with column specification:
cols(
  X1 = col_character(),
  X2 = col_character(),
  X3 = col_double()
)

Sample filtering

leuk_frac_df <- leuk_frac_df %>%
    filter(!(id %in% exclude_samples$aliquot_barcode))

Sample matching

Identify matched samples between miRNA and leukocyte fraction.

leuk_frac_ids <- leuk_frac_df %>% 
    select(id) %>%
    mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>% 
    arrange()
mirna_ids <- mirna_sample_pt_df %>%
    select(id, vial_id) %>%
    arrange()
mirna_leuk_frac_shared_ids <- inner_join(mirna_ids, leuk_frac_ids, 
                                         by = "vial_id",
                                         suffix = c("_mirna", "_leuk_frac"))
# only keep samples with matched vial ID AND portion number
portion_id_minus_analyte_regex <- "([:alnum:]+\\-){4}[0-9]+"
# plate_id_only_regex <- "[:alnum:]+(?=([:alnum:]\\-[:alnum:]+){1}$)"
mirna_leuk_frac_shared_ids <- mirna_leuk_frac_shared_ids %>%
    filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
            == str_extract(id_leuk_frac, portion_id_minus_analyte_regex)))

NOTE: several samples were assayed on multiple plates; average the leukocyte fraction across these before computing correlations with miRNA

Correlate data formatting

leuk_frac_corr_file <- "../data/intermediate/leuk_frac_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(leuk_frac_corr_file) | force_format) {
    leuk_frac_corr_df <- mirna_leuk_frac_shared_ids %>% 
        left_join(leuk_frac_df, by = c("id_leuk_frac" = "id")) %>% 
        group_by(disease, vial_id) %>% 
        summarise(leuk_frac = mean(leuk_frac)) %>% 
        ungroup()
    
    write_feather(leuk_frac_corr_df, leuk_frac_corr_file)
} else {
    leuk_frac_corr_df <- read_feather(leuk_frac_corr_file)
}

Disease-wise correlations

mirna_leuk_frac_corr_file <- "../results/mirna_leuk_frac_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_leuk_frac_corr_file) | force_compute) {
    
    # skip immune cell cancers?
    d_list <- mirna_sample_pt_df %>% 
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>% 
        filter(Disease %in% leuk_frac_corr_df$disease) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(leuk_frac_corr_df$vial_id)
        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))
        
        target_df <- leuk_frac_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            dplyr::rename(sample = vial_id, x = leuk_frac) %>% 
            mutate(correlate = "leuk_frac")
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_leuk_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "leukocyte fraction")
    write_feather(mirna_leuk_frac_corr_df, mirna_leuk_frac_corr_file)
} else {
    mirna_leuk_frac_corr_df <- read_feather(mirna_leuk_frac_corr_file)
}

 

CIBERSORT fraction

Retrieve/load data

Cellular Content: syn4991611 syn8024565 (“TCGA.cluster-by-CIBERSORT-relative.tsv”)

or…?

syn7337221 (“TCGA.Kallisto.fullIDs.cibersort.relative.tsv”)

# ciber_frac_file <- "../data/tcga/TCGA.cluster-by-CIBERSORT-relative.tsv"
ciber_frac_file <- "../data/tcga/TCGA.Kallisto.fullIDs.cibersort.relative.tsv"
ciber_frac_df <- read_tsv(ciber_frac_file)
Parsed with column specification:
cols(
  .default = col_double(),
  SampleID = col_character(),
  CancerType = col_character()
)
See spec(...) for full column specifications.

Sample filtering

ciber_frac_df <- ciber_frac_df %>%
    mutate(id = str_replace_all(SampleID, "\\.", "\\-")) %>% 
    filter(!(id %in% exclude_samples$aliquot_barcode))

Sample matching

Identify matched samples between miRNA and CIBERSORT fraction.

ciber_frac_ids <- ciber_frac_df %>% 
    select(id) %>%
    mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>% 
    arrange()
# only keep samples with matched vial ID AND portion number
mirna_ciber_frac_shared_ids <- inner_join(mirna_ids, ciber_frac_ids,
                                         by = "vial_id",
                                         suffix = c("_mirna", "_ciber_frac")) %>% 
    distinct() %>% 
    filter((str_extract(id_mirna, portion_id_minus_analyte_regex) 
            == str_extract(id_ciber_frac, portion_id_minus_analyte_regex)))

NOTE: several samples were assayed on multiple plates; average the CIBERSORT fraction across these before computing correlations with miRNA

Correlate data formatting

ciber_frac_corr_file <- "../data/intermediate/ciber_frac_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(ciber_frac_corr_file) | force_format) {
    ciber_frac_corr_df <- mirna_ciber_frac_shared_ids %>%
        left_join(ciber_frac_df, by = c("id_ciber_frac" = "id")) %>% 
        select(-id_mirna, -id_ciber_frac, 
               -SampleID, -P.value, -Correlation, -RMSE) %>% 
        group_by(CancerType, vial_id) %>%
        summarise_each(funs(mean)) %>% 
        ungroup()
    
    write_feather(ciber_frac_corr_df, ciber_frac_corr_file)
} else {
    ciber_frac_corr_df <- read_feather(ciber_frac_corr_file)
}

Disease-wise correlations

mirna_ciber_frac_corr_file <- "../results/mirna_ciber_frac_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_ciber_frac_corr_file) | force_compute) {
    
    # skip immune cell cancers?
    d_list <- mirna_sample_pt_df %>%
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>% 
        filter(Disease %in% ciber_frac_corr_df$CancerType) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(ciber_frac_corr_df$vial_id)
        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))
        target_df <- ciber_frac_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            select(-CancerType) %>% 
            gather(correlate, x, -vial_id) %>% 
            dplyr::rename(sample = vial_id)
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_ciber_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "CIBERSORT fraction")
    write_feather(mirna_ciber_frac_corr_df, mirna_ciber_frac_corr_file)
} else {
    mirna_ciber_frac_corr_df <- read_feather(mirna_ciber_frac_corr_file)
}

 

Mutation load

Retrieve/load data

Mutation Load: syn5706827 syn7994728 (“mutation-load”)

  • exclude LAML?
mut_load_file <- "../data/tcga/mutation-load-vial.txt"
mut_load_df <- read_tsv(mut_load_file)
Parsed with column specification:
cols(
  cohort = col_character(),
  Patient_ID = col_character(),
  Tumor_Sample_ID = col_character(),
  Tumor_Sample_ID_Vial = col_character(),
  `Silent per Mb` = col_double(),
  `Non-silent per Mb` = col_double()
)

Sample filtering

Can’t filter aliquots, as data only includes IDs specific to the level of vial

Sample matching

Identify matched samples between miRNA and mutational load.

mut_load_ids <- mut_load_df %>% 
    select(id = Tumor_Sample_ID_Vial) %>% 
    mutate(vial_id = id) %>%
    arrange()
mirna_mut_load_shared_ids <- inner_join(mirna_ids, mut_load_ids, 
                                        by = "vial_id",
                                        suffix = c("_mirna", "_mut_load"))

Correlate data formatting

mut_load_corr_file <- "../data/intermediate/mut_load_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(mut_load_corr_file) | force_format) {
    mut_load_corr_df <- mirna_mut_load_shared_ids %>% 
        left_join(mut_load_df, by = c("id_mut_load" = "Tumor_Sample_ID_Vial"))
    
    write_feather(mut_load_corr_df, mut_load_corr_file)
} else {
    mut_load_corr_df <- read_feather(mut_load_corr_file)
}

Disease-wise correlations

mirna_mut_load_corr_file <- "../results/mirna_mut_load_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_mut_load_corr_file) | force_compute) {
    
    # skip immune cell cancers?
    d_list <- mirna_sample_pt_df %>%
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
        filter(Disease %in% mut_load_corr_df$cohort) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(mut_load_corr_df$vial_id)
        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))
        target_df <- mut_load_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            select(-id_mirna, -id_mut_load, -cohort, -Patient_ID, 
                   -Tumor_Sample_ID) %>% 
            gather(correlate, x, -vial_id) %>% 
            dplyr::rename(sample = vial_id)
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_mut_load_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "mutation load")
    write_feather(mirna_mut_load_corr_df, mirna_mut_load_corr_file)
} else {
    mirna_mut_load_corr_df <- read_feather(mirna_mut_load_corr_file)
}

 

TCR/BCR diversity

Retrieve/load data

T Cell Receptor / Brown et al: syn5876488 syn7063422 (“mitcr_sampleStatistics_20160714.tsv”)

tcr_div_file <- "../data/tcga/mitcr_sampleStatistics_20160714.tsv"
tcr_div_df <- read_tsv(tcr_div_file)
Parsed with column specification:
cols(
  AliquotBarcode = col_character(),
  Study = col_character(),
  SampleTypeLetterCode = col_character(),
  ParticipantBarcode = col_character(),
  SampleBarcode = col_character(),
  CGHub_analysis_id = col_character(),
  totTCR_reads = col_integer(),
  totTCRa_reads = col_integer(),
  totTCRb_reads = col_integer(),
  shannon = col_double(),
  numClones = col_integer()
)

Sample filtering

tcr_dv_df <- tcr_div_df %>%
    filter(!(AliquotBarcode %in% exclude_samples$aliquot_barcode))

Sample matching

Identify matched samples between miRNA and CIBERSORT fraction.

tcr_div_ids <- tcr_div_df %>% 
    select(id = AliquotBarcode, vial_id = SampleBarcode) %>% 
    arrange()
# only keep samples with matched vial ID AND portion number
mirna_tcr_div_shared_ids <- inner_join(mirna_ids, tcr_div_ids,
                                       by = "vial_id",
                                       suffix = c("_mirna", "_tcr_div")) %>% 
    distinct() %>% 
    filter((str_extract(id_mirna, portion_id_minus_analyte_regex) 
            == str_extract(id_tcr_div, portion_id_minus_analyte_regex)))

NOTE: several samples were assayed on multiple plates; average the TCR diversity across these before computing correlations with miRNA

Correlate data formatting

Some samples (AliquotBarcode) have 2 values for Shannon entropy, apparently corresponding to separate analyses on CGHub. I’ll go ahead and average these values per aliquot, prior to averaging Shannon values per vial ID. As a small measure of QC, I’ll also only keep observations with at least 1 TCRA read or 1 TCRB read.

tcr_div_corr_file <- "../data/intermediate/tcr_div_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(tcr_div_corr_file) | force_format) {
    tcr_div_corr_df <- mirna_tcr_div_shared_ids %>%
        left_join(tcr_div_df, by = c("id_tcr_div" = "AliquotBarcode")) %>%
        select(-CGHub_analysis_id) %>% 
        distinct() %>% 
        filter(totTCRa_reads > 0 | totTCRb_reads > 0) %>%
        group_by(Study, vial_id, id_tcr_div) %>%
        summarize(shannon = mean(shannon)) %>%
        group_by(Study, vial_id) %>%
        summarise(shannon = mean(shannon)) %>% 
        ungroup()
    write_feather(tcr_div_corr_df, tcr_div_corr_file)
} else {
    tcr_div_corr_df <- read_feather(tcr_div_corr_file)
}

Disease-wise correlations

mirna_tcr_div_corr_file <- "../results/mirna_tcr_div_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_tcr_div_corr_file) | force_compute) {
    
    # skip immune cell cancers
    d_list <- mirna_sample_pt_df %>%
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>% 
        filter(Disease %in% tcr_div_corr_df$Study) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(tcr_div_corr_df$vial_id)
        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))
        target_df <- tcr_div_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            select(-Study) %>% 
            dplyr::rename(sample = vial_id) %>% 
            gather(correlate, x, -sample)
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_tcr_div_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "TCR diversity")
    write_feather(mirna_tcr_div_corr_df, mirna_tcr_div_corr_file)
} else {
    mirna_tcr_div_corr_df <- read_feather(mirna_tcr_div_corr_file)
}

 

Gene expression

  • Expression of
    • IFNG,IFN-gamma
    • PRF1,Perforin
    • GZMA,Granzyme A
    • PDCD1,PD-1
    • CD274,PD-L1
    • PDCD1LG2,PD-L2
    • IL10,IL-10
    • TGFB1,TGF-beta
    • IDO1,IDO
    • HLA-A

Retrieve/load mRNA data

Batch effects normalized mRNA data: syn4976363 syn4976369 (“EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv”) syn4976366 (" EB++GeneExpAnnotation.tsv“)

Sample characteristics are stored in a tab-delimited text file (Synapse ID: syn4976366) and can be loaded with read_tsv().

# load sample data
mrna_sample_file <- mrna_files %>% 
    filter(file.id == "syn4976366") %>% 
    .[["file_path"]]
mrna_sample_df <- read_tsv(mrna_sample_file)
Parsed with column specification:
cols(
  SampleID = col_character(),
  Center = col_character(),
  platform = col_character(),
  Adjustment = col_character()
)

Sample filtering

Remove samples from mRNA dataset.

mrna_sample_df <- mrna_sample_df %>% 
    filter(!(SampleID %in% exclude_samples$aliquot_barcode))

Sample matching

Identify matched samples between miRNA and mRNA.

mrna_ids <- mrna_sample_df %>% 
    select(SampleID) %>%
    mutate(vial_id = str_replace(SampleID, "(\\-[:alnum:]+){3}$", "")) %>% 
    arrange()
mirna_mrna_shared_ids <- inner_join(mirna_ids, mrna_ids, by = "vial_id")
# only keep samples with matched vial ID AND portion number
mirna_mrna_shared_ids <- mirna_mrna_shared_ids %>%
    filter(str_extract(id, portion_id_minus_analyte_regex)
           == str_extract(SampleID, portion_id_minus_analyte_regex))

mRNA normalized, batch corrected expression values for all samples are stored as a matrix in a TSV file (Synapse ID: syn4976369) and can be loaded with read_tsv().

Correlate data formatting

List of genes accessed here and saved as a TSV at data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv:

gene_correlate_file <- "../data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv"
gene_correlate_df <- read_tsv(gene_correlate_file)
Parsed with column specification:
cols(
  Gene = col_character(),
  `HGNC Symbol` = col_character(),
  `Entrez ID` = col_integer(),
  `Gene Family` = col_character(),
  `Immune Checkpoint` = col_character(),
  matched_name = col_character(),
  group = col_character()
)
3 parsing failures.
row col  expected    actual
 20  -- 7 columns 6 columns
 61  -- 7 columns 6 columns
 64  -- 7 columns 6 columns
mrna_corr_file <- "../data/intermediate/mrna_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(mrna_corr_file) | force_format) {
    # load normalized, batch-corrected expression data
    mrna_norm_file <- mrna_files %>% 
        filter(file.id == "syn4976369") %>% 
        .[["file_path"]]
    mrna_norm_df <- read_tsv(mrna_norm_file, progress = FALSE)
    
    # gene_list <- c("IFNG", "PRF1", "GZMA", "PDCD1", "CD274", "PDCD1LG2", "IL10", 
    #                "TGFB1", "IDO1", "HLA-A")  
    mrna_corr_df <- mrna_norm_df %>% 
        separate(gene_id, c("gene_name", "gene_id"), sep = "\\|") %>% 
        filter((gene_id %in% gene_correlate_df$`Entrez ID`) 
               | (gene_name %in% gene_correlate_df$`HGNC Symbol`)) %>% 
        select(one_of(c("gene_name", "gene_id", 
                        mirna_mrna_shared_ids$SampleID)))
    write_feather(mrna_corr_df, mrna_corr_file)
    rm(mrna_norm_df)
} else {
    mrna_corr_df <- read_feather(mrna_corr_file)
}

Disease-wise correlations

mirna_mrna_corr_file <- "../results/mirna_mrna_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_mrna_corr_file) | force_compute) {
    
    d_list <- mirna_sample_pt_df %>%
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(str_replace(names(mrna_corr_df),
                                  "(\\-[:alnum:]+){3}$", ""))
        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))
        
        target_df <- mrna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("gene_name", samples_d))) %>%
            dplyr::rename(correlate = gene_name) %>%
            gather(sample, x, -correlate) %>% 
            filter(!is.na(x))
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_mrna_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "mRNA expression")
    write_feather(mirna_mrna_corr_df, mirna_mrna_corr_file)
} else {
    mirna_mrna_corr_df <- read_feather(mirna_mrna_corr_file)
}

 

Summarize results

Add color palette

get_cet_pal <- function(cet_path) {
    read_lines(cet_path) %>%
        as.list() %>%
        map_chr(function(c) {
            c_rgb <- str_split_fixed(string = c, pattern = ",", n = 3)
            rgb(c_rgb[1], c_rgb[2], c_rgb[3], maxColorValue = 255)
        })
}
# here's an example with the diverging blue-to yellow colormap:
cet_path <- "~/Downloads/CETperceptual_csv_0_255/diverging-linear_bjy_30-90_c45_n256.csv"
my_cet_pal <- get_cet_pal(cet_path)

External miRNA data

Regulator miRs

mirna_causal_df <- readxl::read_excel("../data/causal_TCGA_panCancer_miRNA_immuneInfiltrate_3_15_2017.xlsx") %>% 
    .[, 1:9] %>% 
    mutate(mirna_disease = str_c(`miRNA Name`, `Tumor Type`, sep = "_"))

Known immune miRs

mirna_immune_df <- readxl::read_excel("../data/Paladini_immune_miRNAs.xlsx")
mirna_immune_tidy_df <- mirna_immune_df %>%
    mutate(mirna_group = str_split(MicroRNAs, ",")) %>%
    unnest(mirna_group) %>%
    mutate(mirna_group = str_trim(mirna_group, "both")) %>%
    # distinct(mirna_group) %>%
    mutate(mirna = mirna_group) %>%
    # filter(Category %in% c("Innate immunity", "Adaptive immunity")) %>%
    # filter(str_detect(mirna, "cluster")) %>%
    filter(!str_detect(mirna_group, "[0-9]+[a-z]{2,}")) %>% 
    mutate(mirna = str_replace(mirna, "miR-17/92 cluster", "miR-17,miR-18a,miR-19a,miR-20a,miR-19b-1,miR-92a-1"),
           mirna = str_replace(mirna, "miR-212/132 cluster", "miR-212,miR-132"),
           mirna = str_replace(mirna, "miR-10 family", "miR-10a,miR-10b,miR-99a,miR-99b,miR-100,miR-125a,miR-125b-1,miR-125b-2"),
           mirna = str_replace(mirna, "miR-221/222", "miR-221,miR-222"),
           mirna = str_replace(mirna, "miR-10a/b", "miR-10a,miR-10b"),
           mirna = str_replace(mirna, "miR-148/152", "miR-148,miR-152"),
           mirna = str_replace(mirna, "miR-17-5p/20a", "miR-17-5p,miR-20a"),
           mirna = str_replace(mirna, "miR-221/222 cluster", "miR-221,miR-222"),
           mirna = str_replace(mirna, "miR-181a/b", "miR-181a,miR-181b"),
           mirna = str_replace(mirna, "miR-15/16", "miR-15,miR-16"),
           mirna = str_replace(mirna, "miR-181 family", "miR-181a-1,miR-181a-2,miR-181b-1,miR-181b-2,miR-181c,miR-181d"),
           mirna = str_replace(mirna, "miR-130/301", "miR-130,miR-301"),
           mirna = str_replace(mirna, "miR-99a/miR-150", "miR-99a,miR-150"),
           mirna = str_replace(mirna, "Let", "let")) %>%
    mutate(mirna = str_split(mirna, ",")) %>% 
    unnest(mirna) %>% 
    # left_join(mirna_sig_corr_df %>%
    #               filter(correlate_type == "CIBERSORT fraction") %>%
    #               mutate(mirna_short = str_extract(
    #                   mirna, "(?<=hsa\\-)[:alnum:]+\\-[:alnum:]+")
    #                   ),
    #           by = c("mirna" = "mirna_short")) %>%
    # filter(!is.na(disease)) %>%
    # group_by(Cell_lineage, Cellular_process, mirna, group, label) %>%
    # tally() %>%
    # ungroup() %>%
    # filter(group == "Adaptive Immune Cells",
    #        str_detect(Cell_lineage, "^T"),
    #        str_detect(label, "^T")) %>%
    # distinct(Cell_lineage, label) %>%
    # mutate(Cell_lineage = str_replace(Cell_lineage, "T ", "T cells "),
    #        Cell_lineage = str_replace(Cell_lineage, " cells$", ""),
    #        label = str_replace_all(label, "\\.", " "),
    #        label = str_replace(label, "CD4", "helper"),
    #        label = str_replace(label, "CD8", "cytotoxic")) %>%
    # filter(!(label %in% Cell_lineage) & !(Cell_lineage %in% label)) %>%
    # mutate(lineage_match = str_detect(label, Cell_lineage)) %>%
    # group_by(Cell_lineage) %>%
    # summarize(num_matches = sum(lineage_match)) %>%
    I

Predicted binding targets

mirna_target_df <- read_tsv("../data/miRDB_v5.0_prediction_result.txt", col_names = FALSE) %>% 
    set_names(c("mirna", "gene", "score")) %>% 
    filter(str_detect(mirna, "hsa"))
mirna_mrna_target_df <- read_tsv("../data/synergizer.tsv", skip = 4) %>% 
    mutate(refseq_mrna = str_split(refseq_mrna, " ")) %>% 
    unnest(refseq_mrna) %>% 
    filter(refseq_mrna %in% mirna_target_df$gene) %>% 
    left_join(mirna_target_df, by = c("refseq_mrna" = "gene")) %>% 
    mutate(mirna_target = str_c(mirna, hgnc_symbol, sep = "_"))

 

Distribution of significant correlations across tumor types

Aggregate counts by correlate type

bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df, 
          mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
          mirna_mrna_corr_df) %>% 
    group_by(disease, correlate_type) %>% 
    summarise(num_correlations = length(estimate),
              significant = sum(p.adjust < 0.05, na.rm = TRUE),
              other = num_correlations - significant) %>% 
    gather(correlations, total, -disease, -correlate_type, -num_correlations) %>% 
    ggplot(aes(x = disease, y = total)) +
    geom_col(aes(fill = disease, alpha = correlations), colour = "slategray") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          axis.text.y = element_text(angle = 45),
          legend.position = "top") +
    scale_fill_manual(values = tcga_colors$Color) +
    scale_alpha_manual(values = c(0.4, 1)) +
    guides(fill = FALSE) +
    facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")

Defining more specific groups within correlate types

correlate_annotations <- mirna_mrna_corr_df %>% 
    select(disease, mirna, correlate, estimate, statistic, p.value, method, alternative, p.adjust, correlate_type) %>% 
    bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df, 
          mirna_mut_load_corr_df, .) %>% 
    distinct(correlate, correlate_type) %>% 
    left_join(gene_correlate_df,
              by = c("correlate" = "matched_name")) %>%  
    mutate(label = ifelse(correlate_type == "mRNA expression", 
                          `HGNC Symbol`, correlate),
           label = str_replace(correlate, "\\.\\.Tregs\\.", ""),
           label = ifelse(label == "leuk_frac", "Leukocyte Fraction", label),
           group = ifelse(correlate_type == "leukocyte fraction", "Total Immune Cells", group),
           group = ifelse(correlate_type == "mutation load", "Mutation Load", group),
           group = ifelse(correlate_type == "CIBERSORT fraction" & str_detect(label, "^(T|B)\\."), 
                          "Adaptive Immune Cells", group),
           group = ifelse(correlate_type == "CIBERSORT fraction" & !str_detect(label, "^(T|B)"), 
                          "Innate Immune Cells", group)) %>% 
    select(correlate, correlate_type, group, label)
mirna_sig_corr_df <- bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df, 
          mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
          mirna_mrna_corr_df) %>% 
    filter(!(disease %in% c("DLBC", "THYM", "LAML"))) %>% 
    filter(!is.na(p.adjust), 
           p.adjust < 0.05) %>% 
    mutate(disease = factor(disease, levels = tcga_colors$Disease)) %>% 
    left_join(correlate_annotations, by = c("correlate", "correlate_type")) %>%
    filter(!is.na(group))
mirna_sig_corr_df %>% 
    mutate(direction = ifelse(estimate > 0, "positive", "negative")) %>%
    group_by(disease, correlate_type, direction) %>% 
    tally() %>% 
    ungroup() %>% 
    mutate(count = ifelse(direction == "negative", -1 * n, n)) %>% 
    ggplot(aes(x = disease, y = count)) +
    geom_col(aes(fill = disease, alpha = direction), colour = "slategray") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "top") +
    scale_fill_manual(values = tcga_colors$Color) +
    scale_alpha_manual(values = c(0.4, 1)) +
    guides(fill = FALSE) +
    facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")

mirna_support_df <- mirna_sig_corr_df %>%
    select(disease, mirna, correlate, estimate, p.adjust, correlate_type,
           group, label) %>%
    mutate(mirna_target = str_c(mirna, correlate, sep = "_"),
           mirna_disease = str_c(mirna, disease, sep = "_"),
           mirna_short = str_extract(mirna, 
                                     "(?<=hsa\\-).*"),
           mirna_short_ambiguous = str_extract(mirna_short,
                                               "[:alnum:]+\\-[:alnum:]+")) %>% 
    mutate(significant = TRUE,
           immunomodulator = correlate_type == "mRNA expression",
           infiltrate = correlate_type %in% c("CIBERSORT fraction", "leukocyte fraction"),
           leukocyte = correlate_type == "leukocyte fraction",
           celltype = correlate_type == "CIBERSORT fraction",
           mirbase_support = (estimate < 0) & (correlate_type == "mRNA expression") &
               mirna_target %in% mirna_mrna_target_df$mirna_target,
           sygnal_support = (correlate_type %in% c("CIBERSORT fraction", "leukocyte fraction")) &
               mirna_disease %in% mirna_causal_df$mirna_disease,
           immune_support = mirna_short %in% mirna_immune_tidy_df$mirna |
               mirna_short_ambiguous %in% mirna_immune_tidy_df$mirna,
           strong = abs(estimate) > 0.5,
           immune_strong = immune_support & strong)
mirna_support_df %>% 
    gather(flag, status, significant, strong, immune_strong,
           immunomodulator, infiltrate, leukocyte, celltype,
           mirbase_support, sygnal_support, immune_support) %>% 
    group_by(mirna, flag) %>%
    summarize(support = n_distinct(disease[status])) %>%
    ungroup() %>% 
    filter(support > 0) %>% 
    group_by(flag) %>% 
    tally()
mirna_support_df %>% 
    gather(flag, status, significant, strong,
           immunomodulator, infiltrate, leukocyte, celltype,
           mirbase_support, sygnal_support, immune_support) %>% 
    group_by(disease, flag) %>%
    summarize(support = sum(status)) %>%
    ungroup() %>% 
    spread(flag, support) %>% 
    select(one_of(c("disease", "significant", "strong", "immune_support", 
             "immunomodulator", "mirbase_support",
             "infiltrate", "leukocyte", "celltype", "sygnal_support")))

Groups and correlates with greatest numbers of correlated miRNAs

mirna_sig_corr_df %>% 
    filter(!is.na(group)) %>% 
    group_by(correlate_type, group, label, disease) %>% 
    summarise(n_mirna = n_distinct(mirna)) %>% 
    ungroup() %>% 
    group_by(correlate_type, group, label) %>% 
    mutate(mean_mirna = mean(n_mirna)) %>% 
    ungroup() %>% 
    arrange(correlate_type, group, desc(mean_mirna)) %>% 
    mutate(group = fct_inorder(group),
           label = fct_inorder(label)) %>%
    ggplot(aes(x = label, y = n_mirna)) +
    geom_boxplot(aes(fill = group), 
                 outlier.size = 0, outlier.alpha = 0) +
    ggplot2::scale_fill_discrete() +
    guides(fill = guide_legend(title = NULL)) +
    ylab("Num. correlated miRNAs") +
    xlab("") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "top")

Frequently associated miRNAs

With leukocyte fraction(s) & causal prediction

mirna_leuk_common_df <- mirna_support_df %>%
    filter(sygnal_support & immune_support) %>% 
    group_by(mirna, disease) %>% 
    summarize(num_types = n_distinct(group)) %>% 
    ungroup() %>% 
    group_by(mirna) %>%
    mutate(nz_disease = n_distinct(disease[num_types > 0])) %>%
    ungroup() %>%
    filter(nz_disease > 2)
mirna_leuk_common_df %>% 
    ggplot(aes(x = disease, y = mirna)) +
    geom_tile(aes(fill = num_types)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot_colors <- tcga_colors %>% 
    filter(Disease %in% mirna_leuk_common_df$disease)
p <- mirna_support_df %>% 
    filter(sygnal_support & immune_support,
           mirna %in% mirna_leuk_common_df$mirna) %>% 
    mutate(group = fct_inorder(group),
           mirna = str_extract(mirna, "miR.*"),
           correlation = ifelse(estimate > 0, "positive", "negative"),
           disease = factor(disease, levels = plot_colors$Disease)) %>%
    ggplot(aes(y = label, x = disease)) +
    geom_tile(aes(fill = disease, colour = correlation), size = 0.4) +
    scale_fill_manual("Tumor Group", values = plot_colors$Color) +
    scale_colour_manual("Correlation", values = c(my_cet_pal[1], my_cet_pal[256])) +
    guides(fill = guide_legend(nrow = 3, byrow = TRUE),
           colour = guide_legend(override.aes = list(fill = "#CCCCCC"),
                                 reverse = TRUE)) +
    ylab("") +
    xlab("") +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          legend.position = "top",
          strip.text.x = element_text(face = "bold", angle = 90, hjust = 0),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          strip.background = element_blank()) +
    facet_grid(group ~ mirna, scales = "free", space = "free")
p

ggsave("../figures/mirna_leuk_supported_corr.png", p,
       width = 17, height = 8, units = "cm", dpi = 300, scale = 2)

With immunomodulators & predicted targets

mirna_immunomod_common_df %>% 
    ggplot(aes(x = disease, y = mirna)) +
    geom_tile(aes(fill = num_types)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot_colors <- tcga_colors %>% 
    filter(Disease %in% mirna_immunomod_common_df$disease)
p <- mirna_support_df %>% 
    filter(mirbase_support & immune_support,
           mirna %in% mirna_immunomod_common_df$mirna) %>% 
    mutate(group = fct_inorder(group),
           mirna = str_extract(mirna, "miR.*"),
           correlation = ifelse(estimate > 0, "positive", "negative"),
           disease = factor(disease, levels = plot_colors$Disease)) %>%
    ggplot(aes(y = label, x = disease)) +
    geom_tile(aes(fill = disease), 
              colour = my_cet_pal[1], size = 0.4) +
    scale_fill_manual("Tumor Group", values = plot_colors$Color) +
    guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
    ylab("") +
    xlab("") +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          legend.position = "top",
          strip.text.x = element_text(face = "bold", angle = 90, hjust = 0),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          strip.background = element_blank()) +
    facet_grid(group ~ mirna, scales = "free", space = "free")
p

ggsave("../figures/mirna_immunomod_supported_corr.png", p,
       width = 17, height = 5, units = "cm", dpi = 300, scale = 2)

All categories…

mirna_all_common_df <- mirna_support_df %>%
    group_by(mirna, disease) %>% 
    summarize(all_support = sum(mirbase_support) > 0 &
                  sum(sygnal_support) > 0 &
                  sum(immune_support) > 0) %>% 
    filter(all_support) %>% 
    group_by(mirna) %>%
    mutate(nz_disease = n_distinct(disease)) %>%
    filter(nz_disease > 1)
p <- mirna_support_df %>% 
    filter(mirna %in% mirna_all_common_df$mirna,
           correlate_type != "mutation load") %>% 
    mutate(support = (mirbase_support | sygnal_support) & immune_support,
           group = fct_inorder(group),
           mirna_short = str_extract(mirna, "miR.*")) %>%
    group_by(mirna, label) %>% 
    mutate(num_diseases = n_distinct(disease)) %>% 
    group_by(group) %>% 
    mutate(label = fct_reorder(label, num_diseases)) %>% 
    ungroup() %>% 
    ggplot(aes(y = label, x = disease)) +
    geom_tile(aes(fill = estimate, colour = support, size = support)) +
    scale_fill_gradientn("Correlation", colours = my_cet_pal) +
    scale_colour_manual("Support", values = c("#333333", "#E69F00")) +
    scale_size_manual("Support", values = c(0.1, 1)) +
    ylab("") +
    xlab("") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "top",
          strip.text.x = element_text(face = "bold"),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          strip.background = element_blank()) +
    facet_grid(group ~ mirna_short, scales = "free", space = "free")
p

ggsave("../figures/mir142_supported_corr.png", p,
       width = 16, height = 16, units = "cm", dpi = 300, scale = 2)
---
title: "PanCan IRWG miRNA Correlative Analysis"
author: "James A. Eddy"
date: '`r lubridate::today()`'
output:
  html_notebook:
    code_folding: hide
    fig_width: 8
    toc: yes
    toc_float: yes
---

# Summary

Exploring associations of miRNA expression levels across tumor types in TCGA with suggested correlates defined by the Immune Response Working Group:

+ Overall Leukocyte fraction  
+ Individual Relative CIBERSORT fraction  
+ Mutation Load  
+ TCR,BCR Diversity  
+ Expression of  
    + IFNG,IFN-gamma  
    + PRF1,Perforin  
    + GZMA,Granzyme A  
    + PDCD1,PD-1  
    + CD274,PD-L1  
    + PDCD1LG2,PD-L2  
    + IL10,IL-10  
    + TGFB1,TGF-beta  
    + IDO1,IDO  
    + HLA-A  


```{r setup, echo=FALSE, message=FALSE, warning=FALSE, include=FALSE, cache=FALSE}
# Synapse client
library(synapseClient)

# parallel computing
library(parallel)

# viz packages
library(ggthemes)
library(viridis)
library(ggbeeswarm)

# packages for general data munging, formatting
library(feather)
library(stringr)
library(forcats)
library(broom)
library(tidyverse)

my_theme_bw <- function() {
    theme_bw() +
        theme(axis.title = element_text(face = "bold", size = rel(0.9)),
              legend.title = element_text(face = "bold"),
              plot.title = element_text(face = "bold"))
}
ggplot2::theme_set(my_theme_bw())
scale_fill_discrete <- function(...) scale_fill_colorblind(...)
scale_fill_continuous <- function(...) scale_fill_viridis(...)
scale_colour_discrete <- function(...) scale_color_colorblind(...)
scale_colour_continuous <- function(...) scale_color_viridis(...)
```

-----

\  

# Prepare data

## Retrieve miRNA data from Synapse

Data are stored on Synapse in the folder `syn6171109`.

```{r, message=FALSE, warning=FALSE, include=FALSE}
if (!dir.exists("data")) {
    dir.create("data")
}
synapseLogin()
mirna_synfolder <- "syn6171109"
mirna_synfiles <- synapseQuery(
    sprintf('select * from file where parentId=="%s"', mirna_synfolder)
)

# download files and store data/paths in new data frame
mirna_files <- mirna_synfiles %>% 
    mutate(file_data = map(file.id, function(synid) {
        synGet(synid, downloadLocation = "../data/tcga/")
    }),
    file_path = map_chr(file_data, getFileLocation))
```

## Load miRNA sample data

Sample characteristics are stored in a tab-delimited text file (Synapse ID: `syn7222010`) and can be loaded with `read_tsv()`.

```{r}
# load sample data
mirna_sample_file <- mirna_files %>% 
    filter(file.id == "syn7222010") %>% 
    .[["file_path"]]
mirna_sample_df <- read_tsv(mirna_sample_file)
```

## Sample filtering

Sample Quality Annotations:
syn4551248 ("merged_sample_quality_annotations.tsv")

```{r}
sample_qual_file <- synGet("syn4551248", downloadLocation = "../data/tcga/")
sample_qual_df <- sample_qual_file %>% 
    getFileLocation() %>% 
    read_tsv()
```

remove samples based on `Do_not_use=True`, and remove cases with `AWG_excluded_because_of_pathology=True`

```{r}
# samples to exclude from all datasets
exclude_samples <- sample_qual_df %>% 
    mutate(AWG_excluded_because_of_pathology = parse_logical(AWG_excluded_because_of_pathology),
           Do_not_use = parse_logical(str_to_upper(Do_not_use))) %>% 
    filter(AWG_excluded_because_of_pathology | Do_not_use)
```

Remove samples from miRNA dataset.

```{r}
mirna_sample_df <- mirna_sample_df %>% 
    filter(!(id %in% exclude_samples$aliquot_barcode))
```

\  

## Load miRNA expression data

miRNA normalized, batch corrected expression values for all samples are stored as a matrix in a CSV file (Synapse ID: `syn7201053`) and can be loaded with `read_csv()`.

```{r warning=FALSE}
mirna_corr_file <- "../data/intermediate/mirna_correlate_data.feather"
force_read <- FALSE
if (!file.exists(mirna_corr_file) | force_read) {
    # load normalized, batch-corrected expression data
    mirna_norm_file <- mirna_files %>% 
        filter(file.id == "syn7201053") %>% 
        .[["file_path"]]
    mirna_norm_df <- read_csv(mirna_norm_file, progress = FALSE)
    
    mirna_corr_df <- mirna_norm_df %>% 
        select(one_of(c("Genes", mirna_sample_pt_df$id)))
    
    write_feather(mirna_corr_df, mirna_corr_file)
    rm(mirna_norm_df)
} else {
    mirna_corr_df <- read_feather(mirna_corr_file)
}
```

\  

## Set up plotting colors

```{r}
tcga_colors <- tribble(
    ~Color, ~Disease,
    "#ED2891", "BRCA",
    "#B2509E", "GBM",
    "#D49DC7", "LGG",
    "#C1A72F", "ACC",
    "#E8C51D", "PCPG",
    "#F9ED32", "THCA",
    "#104A7F", "CHOL",
    "#9EDDF9", "COAD",
    "#007EB5", "ESCA",
    "#CACCDB", "LIHC",
    "#6E7BA2", "PAAD",
    "#DAF1FC", "READ",
    "#00AEEF", "STAD",
    "#F6B667", "CESC",
    "#D97D25", "OV",
    "#FBE3C7", "UCEC",
    "#F89420", "UCS",
    "#97D1A9", "HNSC",
    "#009444", "UVM",
    "#754C29", "LAML",
    "#CEAC8F", "THYM",
    "#3953A4", "DLBC",
    "#BBD642", "SKCM",
    "#00A99D", "SARC",
    "#D3C3E0", "LUAD",
    "#A084BD", "LUSC",
    "#542C88", "MESO",
    "#FAD2D9", "BLCA",
    "#ED1C24", "KICH",
    "#F8AFB3", "KIRC",
    "#EA7075", "KIRP",
    "#7E1918", "PRAD",
    "#BE1E2D", "TGCT"
)

mirna_sample_df <- mirna_sample_df %>% 
    mutate(Disease = factor(Disease, levels = tcga_colors$Disease))
```

-----

\  

# Explore miRNA data

## Sample characteristics

The table `mirna_sample_df` contains `r ncol(mirna_sample_df)` columns describing the `r nrow(mirna_sample_df)` samples in the data. The `Sample_Type` column corresponds to TCGA **sample type codes**, which are defined [here](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes). The following sample types are included in the miRNA data:

```{r}
mirna_sample_df %>% 
    group_by(Sample_Type) %>% 
    tally()
```

\  

Based on the codes, this is the distribution of sample types:

```{r}
mirna_sample_df %>% 
    group_by(Sample_Type) %>% 
    tally() %>% 
    mutate(Definition = case_when(
        .$Sample_Type == 1 ~ "Primary Solid Tumor",
        .$Sample_Type == 2 ~ "Recurrent Solid Tumorr",
        .$Sample_Type == 3 ~ "Primary Blood Derived Cancer - Peripheral Blood",
        .$Sample_Type == 5 ~ "Additional - New Primary",
        .$Sample_Type == 6 ~ "Metastatic",
        .$Sample_Type == 7 ~ "Additional Metastatic",
        .$Sample_Type == 11 ~ "Solid Tissue Normal"
    ))
```

Note: only include "primary" tumor samples for now; also, add additional column to store vial ID (to ease mapping between data sets).

```{r}
mirna_sample_pt_df <- mirna_sample_df %>% 
    filter(Sample_Type %in% c(1, 3, 5)) %>% 
    mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", ""))
```

\  

Additionally, the following disease types are included:

```{r}
mirna_sample_pt_df %>% 
    group_by(Disease) %>% 
    tally()
```

Note: exclude LAML, THYM, and DLBC from cell content correlations

\  

And technical variables:

```{r}
mirna_sample_pt_df %>% 
    group_by(Protocol, Platform) %>% 
    tally()
```

\  

## miRNA expression values

Expression values in the data are reportedly reads per million (RPM). I've randomly selected a few samples from each disease type, sample type, protocol, and platform to inspect the distribution of expression values across all `r nrow(mirna_corr_df)` miRNA genes.

```{r}
set.seed(0)

# randomly select 1-3 samples from each combination of characteristics
sample_sub_df <- mirna_sample_pt_df %>% 
    group_by(Disease, Sample_Type, Protocol, Platform) %>% 
    sample_n(3, replace = TRUE) %>% 
    ungroup() %>% 
    distinct()

# subset and melt the expression data
mirna_sub_df <- mirna_corr_df %>% 
    select(one_of(c("Genes", sample_sub_df$id))) %>%
    gather("sample", "expression", -Genes)
```

\  

Even with a shifted log (`log10(x + 1)`) transformation, expression values look to be more exponentially distributed than normal.

```{r}
mirna_sub_df %>% 
    left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>% 
    ggplot(aes(x = log10(expression + 1))) +
    stat_density(aes(group = sample), geom = "line", position = "identity", 
                 alpha = 0.2)
```

\  

I can also look at the distribution of log-RPM values with boxplots:

```{r}
mirna_sub_df %>% 
    left_join(mirna_sample_pt_df, by = c("sample" = "id")) %>% 
    ggplot(aes(x = sample, y = log10(expression + 1))) +
    geom_boxplot(aes(fill = Disease), outlier.size = 0.5) +
    theme(axis.text.x = element_blank()) +
    scale_fill_manual(values = tcga_colors$Color)
```


```{r, warning=FALSE}
# convert expression df to matrix
mirna_corr_mat <- mirna_corr_df %>% 
    select(one_of(c("Genes", mirna_sample_pt_df$id))) %>% 
    column_to_rownames("Genes") %>% 
    as.matrix()
```
\  

## PCA

I used the `prcomp()` function on the transposed expression matrix (samples x genes, after transposing) to compute PCA data, which I can use to look for batch effects among samples.

```{r}
mirna_corr_pca <- mirna_corr_mat %>% 
    t() %>% 
    prcomp()
```

(I can use the `broom:tidy()` function to convert data in the `prcomp` object into data frames for `ggplot`.)

```{r, warning=FALSE}
pc_df <- mirna_corr_pca %>% 
    tidy("pcs")

mirna_corr_pca_df <- mirna_corr_pca %>% 
    tidy("samples") %>% 
    filter(PC <= 2) %>% 
    left_join(mirna_sample_pt_df, by = c("row" = "id"))

rm(mirna_corr_mat)
```

The plot below shows samples plotted as points along the first two principle components (PCs). Points are colored by disease type.

```{r}
pc1_label <- pc_df %>% 
    filter(PC == 1) %>% 
    transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>% 
    flatten_chr()

pc2_label <- pc_df %>% 
    filter(PC == 2) %>% 
    transmute(label = sprintf("PC%s [%0.2f%%]", PC, 100*percent)) %>% 
    flatten_chr()

mirna_corr_pca_df %>% 
    spread(PC, value) %>% 
    ggplot(aes(x = `1`, y = `2`)) +
    geom_point(aes(colour = Disease)) +
    xlab(pc1_label) +
    ylab(pc2_label) +
    scale_color_manual(values = tcga_colors$Color)
```

-----

\  

# Prepare correlate data

+ Overall Leukocyte fraction  
+ Individual Relative CIBERSORT fraction  
+ Mutation Load  
+ TCR,BCR Diversity  
+ Gene Expression

## Leukocyte fraction

### Retrieve/load data

Cellular Content: syn7994728
syn5808205 ("TCGA_all_leuk_estimate.masked.20170107.tsv")

```{r}
# file_data <- synGet("syn5808205", downloadLocation = "./")
leuk_frac_file <- "../data/tcga/TCGA_all_leuk_estimate.masked.20170107.tsv"
leuk_frac_df <- read_tsv(leuk_frac_file, col_names = FALSE) %>% 
    set_names(c("disease", "id", "leuk_frac"))
```

### Sample filtering

```{r}
leuk_frac_df <- leuk_frac_df %>%
    filter(!(id %in% exclude_samples$aliquot_barcode))
```


### Sample matching

Identify matched samples between miRNA and leukocyte fraction.

```{r}
leuk_frac_ids <- leuk_frac_df %>% 
    select(id) %>%
    mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>% 
    arrange()

mirna_ids <- mirna_sample_pt_df %>%
    select(id, vial_id) %>%
    arrange()

mirna_leuk_frac_shared_ids <- inner_join(mirna_ids, leuk_frac_ids, 
                                         by = "vial_id",
                                         suffix = c("_mirna", "_leuk_frac"))

# only keep samples with matched vial ID AND portion number
portion_id_minus_analyte_regex <- "([:alnum:]+\\-){4}[0-9]+"
# plate_id_only_regex <- "[:alnum:]+(?=([:alnum:]\\-[:alnum:]+){1}$)"
mirna_leuk_frac_shared_ids <- mirna_leuk_frac_shared_ids %>%
    filter((str_extract(id_mirna, portion_id_minus_analyte_regex)
            == str_extract(id_leuk_frac, portion_id_minus_analyte_regex)))
```

NOTE: several samples were assayed on multiple plates; average the leukocyte fraction across these before computing correlations with miRNA

### Correlate data formatting

```{r}
leuk_frac_corr_file <- "../data/intermediate/leuk_frac_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(leuk_frac_corr_file) | force_format) {
    leuk_frac_corr_df <- mirna_leuk_frac_shared_ids %>% 
        left_join(leuk_frac_df, by = c("id_leuk_frac" = "id")) %>% 
        group_by(disease, vial_id) %>% 
        summarise(leuk_frac = mean(leuk_frac)) %>% 
        ungroup()
    
    write_feather(leuk_frac_corr_df, leuk_frac_corr_file)
} else {
    leuk_frac_corr_df <- read_feather(leuk_frac_corr_file)
}
```

### Disease-wise correlations

```{r warning=FALSE}
mirna_leuk_frac_corr_file <- "../results/mirna_leuk_frac_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_leuk_frac_corr_file) | force_compute) {
    
    # skip immune cell cancers?
    d_list <- mirna_sample_pt_df %>% 
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>% 
        filter(Disease %in% leuk_frac_corr_df$disease) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(leuk_frac_corr_df$vial_id)

        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))
        
        target_df <- leuk_frac_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            dplyr::rename(sample = vial_id, x = leuk_frac) %>% 
            mutate(correlate = "leuk_frac")
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_leuk_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "leukocyte fraction")
    write_feather(mirna_leuk_frac_corr_df, mirna_leuk_frac_corr_file)
} else {
    mirna_leuk_frac_corr_df <- read_feather(mirna_leuk_frac_corr_file)
}
```

\  

## CIBERSORT fraction

### Retrieve/load data

Cellular Content: syn4991611
syn8024565 ("TCGA.cluster-by-CIBERSORT-relative.tsv")

or...?

syn7337221 ("TCGA.Kallisto.fullIDs.cibersort.relative.tsv")

```{r}
# ciber_frac_file <- "../data/tcga/TCGA.cluster-by-CIBERSORT-relative.tsv"
ciber_frac_file <- "../data/tcga/TCGA.Kallisto.fullIDs.cibersort.relative.tsv"

ciber_frac_df <- read_tsv(ciber_frac_file)
```

### Sample filtering

```{r}
ciber_frac_df <- ciber_frac_df %>%
    mutate(id = str_replace_all(SampleID, "\\.", "\\-")) %>% 
    filter(!(id %in% exclude_samples$aliquot_barcode))
```

### Sample matching

Identify matched samples between miRNA and CIBERSORT fraction.

```{r}
ciber_frac_ids <- ciber_frac_df %>% 
    select(id) %>%
    mutate(vial_id = str_replace(id, "(\\-[:alnum:]+){3}$", "")) %>% 
    arrange()

# only keep samples with matched vial ID AND portion number
mirna_ciber_frac_shared_ids <- inner_join(mirna_ids, ciber_frac_ids,
                                         by = "vial_id",
                                         suffix = c("_mirna", "_ciber_frac")) %>% 
    distinct() %>% 
    filter((str_extract(id_mirna, portion_id_minus_analyte_regex) 
            == str_extract(id_ciber_frac, portion_id_minus_analyte_regex)))
```

NOTE: several samples were assayed on multiple plates; average the CIBERSORT fraction across these before computing correlations with miRNA

### Correlate data formatting

```{r}
ciber_frac_corr_file <- "../data/intermediate/ciber_frac_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(ciber_frac_corr_file) | force_format) {
    ciber_frac_corr_df <- mirna_ciber_frac_shared_ids %>%
        left_join(ciber_frac_df, by = c("id_ciber_frac" = "id")) %>% 
        select(-id_mirna, -id_ciber_frac, 
               -SampleID, -P.value, -Correlation, -RMSE) %>% 
        group_by(CancerType, vial_id) %>%
        summarise_each(funs(mean)) %>% 
        ungroup()
    
    write_feather(ciber_frac_corr_df, ciber_frac_corr_file)
} else {
    ciber_frac_corr_df <- read_feather(ciber_frac_corr_file)
}
```

### Disease-wise correlations

```{r warning=FALSE}
mirna_ciber_frac_corr_file <- "../results/mirna_ciber_frac_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_ciber_frac_corr_file) | force_compute) {
    
    # skip immune cell cancers?
    d_list <- mirna_sample_pt_df %>%
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>% 
        filter(Disease %in% ciber_frac_corr_df$CancerType) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(ciber_frac_corr_df$vial_id)

        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))

        target_df <- ciber_frac_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            select(-CancerType) %>% 
            gather(correlate, x, -vial_id) %>% 
            dplyr::rename(sample = vial_id)
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_ciber_frac_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "CIBERSORT fraction")
    write_feather(mirna_ciber_frac_corr_df, mirna_ciber_frac_corr_file)
} else {
    mirna_ciber_frac_corr_df <- read_feather(mirna_ciber_frac_corr_file)
}
```

\  

## Mutation load

### Retrieve/load data

Mutation Load: syn5706827
syn7994728 ("mutation-load")

* exclude LAML?

```{r}
mut_load_file <- "../data/tcga/mutation-load-vial.txt"
mut_load_df <- read_tsv(mut_load_file)
```

### Sample filtering

Can't filter aliquots, as data only includes IDs specific to the level of vial

### Sample matching

Identify matched samples between miRNA and mutational load.

```{r}
mut_load_ids <- mut_load_df %>% 
    select(id = Tumor_Sample_ID_Vial) %>% 
    mutate(vial_id = id) %>%
    arrange()

mirna_mut_load_shared_ids <- inner_join(mirna_ids, mut_load_ids, 
                                        by = "vial_id",
                                        suffix = c("_mirna", "_mut_load"))
```

### Correlate data formatting

```{r}
mut_load_corr_file <- "../data/intermediate/mut_load_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(mut_load_corr_file) | force_format) {
    mut_load_corr_df <- mirna_mut_load_shared_ids %>% 
        left_join(mut_load_df, by = c("id_mut_load" = "Tumor_Sample_ID_Vial"))
    
    write_feather(mut_load_corr_df, mut_load_corr_file)
} else {
    mut_load_corr_df <- read_feather(mut_load_corr_file)
}
```

### Disease-wise correlations

```{r warning=FALSE}
mirna_mut_load_corr_file <- "../results/mirna_mut_load_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_mut_load_corr_file) | force_compute) {
    
    # skip immune cell cancers?
    d_list <- mirna_sample_pt_df %>%
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>%
        filter(Disease %in% mut_load_corr_df$cohort) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(mut_load_corr_df$vial_id)

        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))

        target_df <- mut_load_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            select(-id_mirna, -id_mut_load, -cohort, -Patient_ID, 
                   -Tumor_Sample_ID) %>% 
            gather(correlate, x, -vial_id) %>% 
            dplyr::rename(sample = vial_id)
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_mut_load_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "mutation load")
    write_feather(mirna_mut_load_corr_df, mirna_mut_load_corr_file)
} else {
    mirna_mut_load_corr_df <- read_feather(mirna_mut_load_corr_file)
}
```

\  

## TCR/BCR diversity

### Retrieve/load data

T Cell Receptor / Brown et al: syn5876488
syn7063422 ("mitcr_sampleStatistics_20160714.tsv")

```{r}
tcr_div_file <- "../data/tcga/mitcr_sampleStatistics_20160714.tsv"

tcr_div_df <- read_tsv(tcr_div_file)
```

### Sample filtering

```{r}
tcr_dv_df <- tcr_div_df %>%
    filter(!(AliquotBarcode %in% exclude_samples$aliquot_barcode))
```

### Sample matching

Identify matched samples between miRNA and CIBERSORT fraction.

```{r}
tcr_div_ids <- tcr_div_df %>% 
    select(id = AliquotBarcode, vial_id = SampleBarcode) %>% 
    arrange()

# only keep samples with matched vial ID AND portion number
mirna_tcr_div_shared_ids <- inner_join(mirna_ids, tcr_div_ids,
                                       by = "vial_id",
                                       suffix = c("_mirna", "_tcr_div")) %>% 
    distinct() %>% 
    filter((str_extract(id_mirna, portion_id_minus_analyte_regex) 
            == str_extract(id_tcr_div, portion_id_minus_analyte_regex)))
```

NOTE: several samples were assayed on multiple plates; average the TCR diversity across these before computing correlations with miRNA

### Correlate data formatting

Some samples (`AliquotBarcode`) have 2 values for Shannon entropy, apparently corresponding to separate analyses on CGHub. I'll go ahead and average these values per aliquot, prior to averaging Shannon values per vial ID. As a small measure of QC, I'll also only keep observations with at least 1 TCRA read or 1 TCRB read.

```{r}
tcr_div_corr_file <- "../data/intermediate/tcr_div_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(tcr_div_corr_file) | force_format) {
    tcr_div_corr_df <- mirna_tcr_div_shared_ids %>%
        left_join(tcr_div_df, by = c("id_tcr_div" = "AliquotBarcode")) %>%
        select(-CGHub_analysis_id) %>% 
        distinct() %>% 
        filter(totTCRa_reads > 0 | totTCRb_reads > 0) %>%
        group_by(Study, vial_id, id_tcr_div) %>%
        summarize(shannon = mean(shannon)) %>%
        group_by(Study, vial_id) %>%
        summarise(shannon = mean(shannon)) %>% 
        ungroup()

    write_feather(tcr_div_corr_df, tcr_div_corr_file)
} else {
    tcr_div_corr_df <- read_feather(tcr_div_corr_file)
}
```

### Disease-wise correlations

```{r warning=FALSE}
mirna_tcr_div_corr_file <- "../results/mirna_tcr_div_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_tcr_div_corr_file) | force_compute) {
    
    # skip immune cell cancers
    d_list <- mirna_sample_pt_df %>%
        # filter((!Disease %in% c("LAML", "THYM", "DLBC"))) %>% 
        filter(Disease %in% tcr_div_corr_df$Study) %>% 
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(tcr_div_corr_df$vial_id)

        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))

        target_df <- tcr_div_corr_df %>%
            filter(vial_id %in% samples_d) %>% 
            select(-Study) %>% 
            dplyr::rename(sample = vial_id) %>% 
            gather(correlate, x, -sample)
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_tcr_div_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "TCR diversity")
    write_feather(mirna_tcr_div_corr_df, mirna_tcr_div_corr_file)
} else {
    mirna_tcr_div_corr_df <- read_feather(mirna_tcr_div_corr_file)
}
```

\  

## Gene expression

+ Expression of  
    + IFNG,IFN-gamma  
    + PRF1,Perforin  
    + GZMA,Granzyme A  
    + PDCD1,PD-1  
    + CD274,PD-L1  
    + PDCD1LG2,PD-L2  
    + IL10,IL-10  
    + TGFB1,TGF-beta  
    + IDO1,IDO  
    + HLA-A  

### Retrieve/load mRNA data

Batch effects normalized mRNA data: syn4976363
syn4976369 ("EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv")
syn4976366 (" EB++GeneExpAnnotation.tsv")

```{r include=FALSE}
mrna_synfolder <- "syn4976363"
mrna_synfiles <- synapseQuery(
    sprintf('select * from file where parentId=="%s"', mrna_synfolder)
)

# download files and store data/paths in new data frame
mrna_files <- mrna_synfiles %>% 
    mutate(file_data = map(file.id, function(synid) {
        synGet(synid, downloadLocation = "../data/tcga")
    }),
    file_path = map_chr(file_data, getFileLocation))
```

Sample characteristics are stored in a tab-delimited text file (Synapse ID: `syn4976366`) and can be loaded with `read_tsv()`.

```{r}
# load sample data
mrna_sample_file <- mrna_files %>% 
    filter(file.id == "syn4976366") %>% 
    .[["file_path"]]
mrna_sample_df <- read_tsv(mrna_sample_file)
```

### Sample filtering

Remove samples from mRNA dataset.

```{r}
mrna_sample_df <- mrna_sample_df %>% 
    filter(!(SampleID %in% exclude_samples$aliquot_barcode))
```

### Sample matching

Identify matched samples between miRNA and mRNA.

```{r}
mrna_ids <- mrna_sample_df %>% 
    select(SampleID) %>%
    mutate(vial_id = str_replace(SampleID, "(\\-[:alnum:]+){3}$", "")) %>% 
    arrange()

mirna_mrna_shared_ids <- inner_join(mirna_ids, mrna_ids, by = "vial_id")

# only keep samples with matched vial ID AND portion number
mirna_mrna_shared_ids <- mirna_mrna_shared_ids %>%
    filter(str_extract(id, portion_id_minus_analyte_regex)
           == str_extract(SampleID, portion_id_minus_analyte_regex))
```

mRNA normalized, batch corrected expression values for all samples are stored as a matrix in a TSV file (Synapse ID: `syn4976369`) and can be loaded with `read_tsv()`.

### Correlate data formatting

List of genes accessed [here](https://docs.google.com/spreadsheets/d/1aqOXYsU1ubkbxIZI_5p8ZRootgOAT0KweMA3LvSZ7HY/edit#gid=0) and saved as a TSV at `data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv`:

```{r}
gene_correlate_file <- "../data/Cancer Immunomodulators - TCGA PanImmune Group - Direct Relationship.tsv"
gene_correlate_df <- read_tsv(gene_correlate_file)
```


```{r}
mrna_corr_file <- "../data/intermediate/mrna_correlates_for_mirna.feather"
force_format <- FALSE
if (!file.exists(mrna_corr_file) | force_format) {
    # load normalized, batch-corrected expression data
    mrna_norm_file <- mrna_files %>% 
        filter(file.id == "syn4976369") %>% 
        .[["file_path"]]
    mrna_norm_df <- read_tsv(mrna_norm_file, progress = FALSE)
    
    # gene_list <- c("IFNG", "PRF1", "GZMA", "PDCD1", "CD274", "PDCD1LG2", "IL10", 
    #                "TGFB1", "IDO1", "HLA-A")  
    mrna_corr_df <- mrna_norm_df %>% 
        separate(gene_id, c("gene_name", "gene_id"), sep = "\\|") %>% 
        filter((gene_id %in% gene_correlate_df$`Entrez ID`) 
               | (gene_name %in% gene_correlate_df$`HGNC Symbol`)) %>% 
        select(one_of(c("gene_name", "gene_id", 
                        mirna_mrna_shared_ids$SampleID)))
    write_feather(mrna_corr_df, mrna_corr_file)
    rm(mrna_norm_df)
} else {
    mrna_corr_df <- read_feather(mrna_corr_file)
}
```

### Disease-wise correlations

```{r message=FALSE, warning=FALSE}
mirna_mrna_corr_file <- "../results/mirna_mrna_correlation.feather"
force_compute <- FALSE
if (!file.exists(mirna_mrna_corr_file) | force_compute) {
    
    d_list <- mirna_sample_pt_df %>%
        distinct(Disease) %>% 
        mutate(Disease = as.character(Disease)) %>% 
        .$Disease %>%
        set_names(.) %>% 
        as.list()
    
    corr_df_list <- mclapply(d_list, mc.cores = 4, function(d) {
        samples_d <- mirna_sample_pt_df %>%
            filter(Disease == d) %>%
            select(vial_id) %>%
            flatten_chr() %>%
            intersect(str_replace(names(mrna_corr_df),
                                  "(\\-[:alnum:]+){3}$", ""))

        source_df <- mirna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("Genes", samples_d))) %>%
            dplyr::rename(mirna = Genes) %>%
            gather(sample, x, -mirna) %>% 
            filter(!is.na(x))
        
        target_df <- mrna_corr_df %>%
            set_names(str_replace(names(.), "(\\-[:alnum:]+){3}$", "")) %>%
            select(one_of(c("gene_name", samples_d))) %>%
            dplyr::rename(correlate = gene_name) %>%
            gather(sample, x, -correlate) %>% 
            filter(!is.na(x))
        
        corr_df <- inner_join(source_df, target_df,
                              by = "sample",
                              suffix = c("_source", "_target")) %>%
            group_by(mirna, correlate) %>%
            do(tidy(cor.test(.$x_source, .$x_target, method = "spearman"))) %>% 
            ungroup()
        corr_df[["p.adjust"]] <- p.adjust(corr_df$p.value, method = "BH")
        return(corr_df)
    })
    
    mirna_mrna_corr_df <- bind_rows(corr_df_list, .id = "disease") %>% 
        mutate(correlate_type = "mRNA expression")
    write_feather(mirna_mrna_corr_df, mirna_mrna_corr_file)
} else {
    mirna_mrna_corr_df <- read_feather(mirna_mrna_corr_file)
}
```

-----

\  

# Summarize results

## Add color palette

```{r}
get_cet_pal <- function(cet_path) {
    read_lines(cet_path) %>%
        as.list() %>%
        map_chr(function(c) {
            c_rgb <- str_split_fixed(string = c, pattern = ",", n = 3)
            rgb(c_rgb[1], c_rgb[2], c_rgb[3], maxColorValue = 255)
        })
}

# here's an example with the diverging blue-to yellow colormap:
cet_path <- "~/Downloads/CETperceptual_csv_0_255/diverging-linear_bjy_30-90_c45_n256.csv"
my_cet_pal <- get_cet_pal(cet_path)
```


## External miRNA data

### Regulator miRs

```{r}
mirna_causal_df <- readxl::read_excel("../data/causal_TCGA_panCancer_miRNA_immuneInfiltrate_3_15_2017.xlsx") %>% 
    .[, 1:9] %>% 
    mutate(mirna_disease = str_c(`miRNA Name`, `Tumor Type`, sep = "_"))
```

### Known immune miRs

```{r}
mirna_immune_df <- readxl::read_excel("../data/Paladini_immune_miRNAs.xlsx")
```

```{r}
mirna_immune_tidy_df <- mirna_immune_df %>%
    mutate(mirna_group = str_split(MicroRNAs, ",")) %>%
    unnest(mirna_group) %>%
    mutate(mirna_group = str_trim(mirna_group, "both")) %>%
    # distinct(mirna_group) %>%
    mutate(mirna = mirna_group) %>%
    # filter(Category %in% c("Innate immunity", "Adaptive immunity")) %>%
    # filter(str_detect(mirna, "cluster")) %>%
    filter(!str_detect(mirna_group, "[0-9]+[a-z]{2,}")) %>% 
    mutate(mirna = str_replace(mirna, "miR-17/92 cluster", "miR-17,miR-18a,miR-19a,miR-20a,miR-19b-1,miR-92a-1"),
           mirna = str_replace(mirna, "miR-212/132 cluster", "miR-212,miR-132"),
           mirna = str_replace(mirna, "miR-10 family", "miR-10a,miR-10b,miR-99a,miR-99b,miR-100,miR-125a,miR-125b-1,miR-125b-2"),
           mirna = str_replace(mirna, "miR-221/222", "miR-221,miR-222"),
           mirna = str_replace(mirna, "miR-10a/b", "miR-10a,miR-10b"),
           mirna = str_replace(mirna, "miR-148/152", "miR-148,miR-152"),
           mirna = str_replace(mirna, "miR-17-5p/20a", "miR-17-5p,miR-20a"),
           mirna = str_replace(mirna, "miR-221/222 cluster", "miR-221,miR-222"),
           mirna = str_replace(mirna, "miR-181a/b", "miR-181a,miR-181b"),
           mirna = str_replace(mirna, "miR-15/16", "miR-15,miR-16"),
           mirna = str_replace(mirna, "miR-181 family", "miR-181a-1,miR-181a-2,miR-181b-1,miR-181b-2,miR-181c,miR-181d"),
           mirna = str_replace(mirna, "miR-130/301", "miR-130,miR-301"),
           mirna = str_replace(mirna, "miR-99a/miR-150", "miR-99a,miR-150"),
           mirna = str_replace(mirna, "Let", "let")) %>%
    mutate(mirna = str_split(mirna, ",")) %>% 
    unnest(mirna) %>% 
    # left_join(mirna_sig_corr_df %>%
    #               filter(correlate_type == "CIBERSORT fraction") %>%
    #               mutate(mirna_short = str_extract(
    #                   mirna, "(?<=hsa\\-)[:alnum:]+\\-[:alnum:]+")
    #                   ),
    #           by = c("mirna" = "mirna_short")) %>%
    # filter(!is.na(disease)) %>%
    # group_by(Cell_lineage, Cellular_process, mirna, group, label) %>%
    # tally() %>%
    # ungroup() %>%
    # filter(group == "Adaptive Immune Cells",
    #        str_detect(Cell_lineage, "^T"),
    #        str_detect(label, "^T")) %>%
    # distinct(Cell_lineage, label) %>%
    # mutate(Cell_lineage = str_replace(Cell_lineage, "T ", "T cells "),
    #        Cell_lineage = str_replace(Cell_lineage, " cells$", ""),
    #        label = str_replace_all(label, "\\.", " "),
    #        label = str_replace(label, "CD4", "helper"),
    #        label = str_replace(label, "CD8", "cytotoxic")) %>%
    # filter(!(label %in% Cell_lineage) & !(Cell_lineage %in% label)) %>%
    # mutate(lineage_match = str_detect(label, Cell_lineage)) %>%
    # group_by(Cell_lineage) %>%
    # summarize(num_matches = sum(lineage_match)) %>%
    I
```

### Predicted binding targets

```{r}
mirna_target_df <- read_tsv("../data/miRDB_v5.0_prediction_result.txt", col_names = FALSE) %>% 
    set_names(c("mirna", "gene", "score")) %>% 
    filter(str_detect(mirna, "hsa"))
```

```{r}
mirna_mrna_target_df <- read_tsv("../data/synergizer.tsv", skip = 4) %>% 
    mutate(refseq_mrna = str_split(refseq_mrna, " ")) %>% 
    unnest(refseq_mrna) %>% 
    filter(refseq_mrna %in% mirna_target_df$gene) %>% 
    left_join(mirna_target_df, by = c("refseq_mrna" = "gene")) %>% 
    mutate(mirna_target = str_c(mirna, hgnc_symbol, sep = "_"))
```

\  

## Distribution of significant correlations across tumor types

### Aggregate counts by correlate type

```{r}
bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df, 
          mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
          mirna_mrna_corr_df) %>% 
    group_by(disease, correlate_type) %>% 
    summarise(num_correlations = length(estimate),
              significant = sum(p.adjust < 0.05, na.rm = TRUE),
              other = num_correlations - significant) %>% 
    gather(correlations, total, -disease, -correlate_type, -num_correlations) %>% 
    ggplot(aes(x = disease, y = total)) +
    geom_col(aes(fill = disease, alpha = correlations), colour = "slategray") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          axis.text.y = element_text(angle = 45),
          legend.position = "top") +
    scale_fill_manual(values = tcga_colors$Color) +
    scale_alpha_manual(values = c(0.4, 1)) +
    guides(fill = FALSE) +
    facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")
```

### Defining more specific groups within correlate types

```{r}
correlate_annotations <- mirna_mrna_corr_df %>% 
    select(disease, mirna, correlate, estimate, statistic, p.value, method, alternative, p.adjust, correlate_type) %>% 
    bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df, 
          mirna_mut_load_corr_df, .) %>% 
    distinct(correlate, correlate_type) %>% 
    left_join(gene_correlate_df,
              by = c("correlate" = "matched_name")) %>%  
    mutate(label = ifelse(correlate_type == "mRNA expression", 
                          `HGNC Symbol`, correlate),
           label = str_replace(correlate, "\\.\\.Tregs\\.", ""),
           label = ifelse(label == "leuk_frac", "Leukocyte Fraction", label),
           group = ifelse(correlate_type == "leukocyte fraction", "Total Immune Cells", group),
           group = ifelse(correlate_type == "mutation load", "Mutation Load", group),
           group = ifelse(correlate_type == "CIBERSORT fraction" & str_detect(label, "^(T|B)\\."), 
                          "Adaptive Immune Cells", group),
           group = ifelse(correlate_type == "CIBERSORT fraction" & !str_detect(label, "^(T|B)"), 
                          "Innate Immune Cells", group)) %>% 
    select(correlate, correlate_type, group, label)
```

```{r}
mirna_sig_corr_df <- bind_rows(mirna_leuk_frac_corr_df, mirna_ciber_frac_corr_df, 
          mirna_mut_load_corr_df, mirna_tcr_div_corr_df,
          mirna_mrna_corr_df) %>% 
    filter(!(disease %in% c("DLBC", "THYM", "LAML"))) %>% 
    filter(!is.na(p.adjust), 
           p.adjust < 0.05) %>% 
    mutate(disease = factor(disease, levels = tcga_colors$Disease)) %>% 
    left_join(correlate_annotations, by = c("correlate", "correlate_type")) %>%
    filter(!is.na(group))
```


```{r, fig.width=8, fig.height=5}
mirna_sig_corr_df %>% 
    mutate(direction = ifelse(estimate > 0, "positive", "negative")) %>%
    group_by(disease, correlate_type, direction) %>% 
    tally() %>% 
    ungroup() %>% 
    mutate(count = ifelse(direction == "negative", -1 * n, n)) %>% 
    ggplot(aes(x = disease, y = count)) +
    geom_col(aes(fill = disease, alpha = direction), colour = "slategray") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "top") +
    scale_fill_manual(values = tcga_colors$Color) +
    scale_alpha_manual(values = c(0.4, 1)) +
    guides(fill = FALSE) +
    facet_wrap(~ correlate_type, ncol = 2, scales = "free_y")
```

```{r}
mirna_support_df <- mirna_sig_corr_df %>%
    select(disease, mirna, correlate, estimate, p.adjust, correlate_type,
           group, label) %>%
    mutate(mirna_target = str_c(mirna, correlate, sep = "_"),
           mirna_disease = str_c(mirna, disease, sep = "_"),
           mirna_short = str_extract(mirna, 
                                     "(?<=hsa\\-).*"),
           mirna_short_ambiguous = str_extract(mirna_short,
                                               "[:alnum:]+\\-[:alnum:]+")) %>% 
    mutate(significant = TRUE,
           immunomodulator = correlate_type == "mRNA expression",
           infiltrate = correlate_type %in% c("CIBERSORT fraction", "leukocyte fraction"),
           leukocyte = correlate_type == "leukocyte fraction",
           celltype = correlate_type == "CIBERSORT fraction",
           mirbase_support = (estimate < 0) & (correlate_type == "mRNA expression") &
               mirna_target %in% mirna_mrna_target_df$mirna_target,
           sygnal_support = (correlate_type %in% c("CIBERSORT fraction", "leukocyte fraction")) &
               mirna_disease %in% mirna_causal_df$mirna_disease,
           immune_support = mirna_short %in% mirna_immune_tidy_df$mirna |
               mirna_short_ambiguous %in% mirna_immune_tidy_df$mirna,
           strong = abs(estimate) > 0.5,
           immune_strong = immune_support & strong)
```

```{r}
mirna_support_df %>% 
    gather(flag, status, significant, strong, immune_strong,
           immunomodulator, infiltrate, leukocyte, celltype,
           mirbase_support, sygnal_support, immune_support) %>% 
    group_by(mirna, flag) %>%
    summarize(support = n_distinct(disease[status])) %>%
    ungroup() %>% 
    filter(support > 0) %>% 
    group_by(flag) %>% 
    tally()
```

```{r}
mirna_support_df %>% 
    gather(flag, status, significant, strong,
           immunomodulator, infiltrate, leukocyte, celltype,
           mirbase_support, sygnal_support, immune_support) %>% 
    group_by(disease, flag) %>%
    summarize(support = sum(status)) %>%
    ungroup() %>% 
    spread(flag, support) %>% 
    select(one_of(c("disease", "significant", "strong", "immune_support", 
             "immunomodulator", "mirbase_support",
             "infiltrate", "leukocyte", "celltype", "sygnal_support")))
```


### Groups and correlates with greatest numbers of correlated miRNAs

```{r}
mirna_sig_corr_df %>% 
    filter(!is.na(group)) %>% 
    group_by(correlate_type, group, label, disease) %>% 
    summarise(n_mirna = n_distinct(mirna)) %>% 
    ungroup() %>% 
    group_by(correlate_type, group, label) %>% 
    mutate(mean_mirna = mean(n_mirna)) %>% 
    ungroup() %>% 
    arrange(correlate_type, group, desc(mean_mirna)) %>% 
    mutate(group = fct_inorder(group),
           label = fct_inorder(label)) %>%
    ggplot(aes(x = label, y = n_mirna)) +
    geom_boxplot(aes(fill = group), 
                 outlier.size = 0, outlier.alpha = 0) +
    ggplot2::scale_fill_discrete() +
    guides(fill = guide_legend(title = NULL)) +
    ylab("Num. correlated miRNAs") +
    xlab("") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.position = "top")
```

## Frequently associated miRNAs

### With leukocyte fraction(s) & causal prediction

```{r}
mirna_leuk_common_df <- mirna_support_df %>%
    filter(sygnal_support & immune_support) %>% 
    group_by(mirna, disease) %>% 
    summarize(num_types = n_distinct(group)) %>% 
    ungroup() %>% 
    group_by(mirna) %>%
    mutate(nz_disease = n_distinct(disease[num_types > 0])) %>%
    ungroup() %>%
    filter(nz_disease > 2)
```

```{r}
mirna_leuk_common_df %>% 
    ggplot(aes(x = disease, y = mirna)) +
    geom_tile(aes(fill = num_types)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

```{r, fig.width=6, fig.height=3.5}
plot_colors <- tcga_colors %>% 
    filter(Disease %in% mirna_leuk_common_df$disease)

p <- mirna_support_df %>% 
    filter(sygnal_support & immune_support,
           mirna %in% mirna_leuk_common_df$mirna) %>% 
    mutate(group = fct_inorder(group),
           mirna = str_extract(mirna, "miR.*"),
           correlation = ifelse(estimate > 0, "positive", "negative"),
           disease = factor(disease, levels = plot_colors$Disease)) %>%
    ggplot(aes(y = label, x = disease)) +
    geom_tile(aes(fill = disease, colour = correlation), size = 0.4) +
    scale_fill_manual("Tumor Group", values = plot_colors$Color) +
    scale_colour_manual("Correlation", values = c(my_cet_pal[1], my_cet_pal[256])) +
    guides(fill = guide_legend(nrow = 3, byrow = TRUE),
           colour = guide_legend(override.aes = list(fill = "#CCCCCC"),
                                 reverse = TRUE)) +
    ylab("") +
    xlab("") +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          legend.position = "top",
          strip.text.x = element_text(face = "bold", angle = 90, hjust = 0),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          strip.background = element_blank()) +
    facet_grid(group ~ mirna, scales = "free", space = "free")
p

ggsave("../figures/mirna_leuk_supported_corr.png", p,
       width = 17, height = 8, units = "cm", dpi = 300, scale = 2)
```

### With immunomodulators & predicted targets

```{r}
mirna_immunomod_common_df <- mirna_support_df %>%
    filter(mirbase_support & immune_support) %>% 
    group_by(mirna, disease) %>% 
    summarize(num_types = n_distinct(group)) %>% 
    ungroup() %>% 
    group_by(mirna) %>%
    mutate(nz_disease = n_distinct(disease[num_types > 0])) %>%
    ungroup() %>%
    distinct(mirna)
    filter(nz_disease > 14)
```

```{r}
mirna_immunomod_common_df %>% 
    ggplot(aes(x = disease, y = mirna)) +
    geom_tile(aes(fill = num_types)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

```{r, fig.width=6, fig.height=3.5}
plot_colors <- tcga_colors %>% 
    filter(Disease %in% mirna_immunomod_common_df$disease)

p <- mirna_support_df %>% 
    filter(mirbase_support & immune_support,
           mirna %in% mirna_immunomod_common_df$mirna) %>% 
    mutate(group = fct_inorder(group),
           mirna = str_extract(mirna, "miR.*"),
           correlation = ifelse(estimate > 0, "positive", "negative"),
           disease = factor(disease, levels = plot_colors$Disease)) %>%
    ggplot(aes(y = label, x = disease)) +
    geom_tile(aes(fill = disease), 
              colour = my_cet_pal[1], size = 0.4) +
    scale_fill_manual("Tumor Group", values = plot_colors$Color) +
    guides(fill = guide_legend(nrow = 3, byrow = TRUE)) +
    ylab("") +
    xlab("") +
    theme(axis.text.x = element_blank(),
          axis.ticks.x = element_blank(),
          legend.position = "top",
          strip.text.x = element_text(face = "bold", angle = 90, hjust = 0),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          strip.background = element_blank()) +
    facet_grid(group ~ mirna, scales = "free", space = "free")
p

ggsave("../figures/mirna_immunomod_supported_corr.png", p,
       width = 17, height = 5, units = "cm", dpi = 300, scale = 2)
```

### All categories...

```{r}
mirna_all_common_df <- mirna_support_df %>%
    group_by(mirna, disease) %>% 
    summarize(all_support = sum(mirbase_support) > 0 &
                  sum(sygnal_support) > 0 &
                  sum(immune_support) > 0) %>% 
    filter(all_support) %>% 
    group_by(mirna) %>%
    mutate(nz_disease = n_distinct(disease)) %>%
    filter(nz_disease > 1)
```

```{r, fig.width=5, fig.height=6}
p <- mirna_support_df %>% 
    filter(mirna %in% mirna_all_common_df$mirna,
           correlate_type != "mutation load") %>% 
    mutate(support = (mirbase_support | sygnal_support) & immune_support,
           group = fct_inorder(group),
           mirna_short = str_extract(mirna, "miR.*")) %>%
    group_by(mirna, label) %>% 
    mutate(num_diseases = n_distinct(disease)) %>% 
    group_by(group) %>% 
    mutate(label = fct_reorder(label, num_diseases)) %>% 
    ungroup() %>% 
    ggplot(aes(y = label, x = disease)) +
    geom_tile(aes(fill = estimate, colour = support, size = support)) +
    scale_fill_gradientn("Correlation", colours = my_cet_pal) +
    scale_colour_manual("Support", values = c("#333333", "#E69F00")) +
    scale_size_manual("Support", values = c(0.1, 1)) +
    ylab("") +
    xlab("") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "top",
          strip.text.x = element_text(face = "bold"),
          strip.text.y = element_text(angle = 0, face = "bold", hjust = 0),
          strip.background = element_blank()) +
    facet_grid(group ~ mirna_short, scales = "free", space = "free")
p

ggsave("../figures/mir142_supported_corr.png", p,
       width = 16, height = 16, units = "cm", dpi = 300, scale = 1.8)
```

